<<<<<<< HEAD <<<<<<< HEAD ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 Changing Cupcake Parameters <<<<<<< HEAD <<<<<<< HEAD ======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 <<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 <<<<<<< HEAD <<<<<<< HEAD
<<<<<<< HEAD <<<<<<< HEAD
======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5
<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Aims

To evaluate consequences of reducing cupcake parameters (coverage identity) from 99% (default) to 95% on whole transcriptome
- Number of isoforms and genes from SQANTI filtered report using the two different praameters
- Explore the additional isoforms that are retained from 95% –> lower FL count in general?
- Visually check additional isoforms

Previously using ERCC, demonstrated with ERCC that cupcake’s coverage default parameter at 99% is too stringent, with 20 lowly-expressed ERCCs discarded. therefore, reran demux pipeline on merged whole transcriptome data (n = 12) but using a lower coverage threshold (95%).

<<<<<<< HEAD <<<<<<< HEAD

Mapping alignment of merged whole transcriptome data further showed signicant number of transcripts that had a lower coverage, and would have been discarded, therefore, reran demux pipeline on merged whole transcriptome data (n = 12) but using a lower coverage threshold (85%).

======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Method

Checking the number of transcripts throughout pipeline from alignment with mouse genome (Minimap2):
1. Cupcake collapse (coverage identity at default threshold - 99% or at reduced threshold - 95%)
2. Cupcake filter
3. SQANTI characterisation
4. SQANTI filtering
5. TAMA filtering

Two analysis were done in parallel with the only difference in the cupcake coverage identity parameter (at cupcake collapse).

Pipeline

Pipeline for analysis

Pipeline for analysis

# function to read in the individual files 
read_inputfile <- function(path_inputfile, sqanti_gtf, file_option){
  #cat("processing", path_inputfile,"\n")
  
  if (file_option == "cupcake"){
    df <-  read.table(path_inputfile, as.is = T, sep = "\t", header = T)%>% 
      mutate(cluster = paste0("PB.",word(.$pbid, c(2), sep = fixed("."))))
  } else if (file_option == "sqanti_discard"){
    df <-  read.table(path_inputfile, as.is = T, sep = ",", header = T) %>% 
      mutate(cluster = paste0("PB.",word(.$filtered_isoform, c(2), sep = fixed(".")))) 
  } else if (file_option == "tama_discard"){
    df <-  read.table(path_inputfile, as.is = T) %>% 
      mutate(id = paste0("PB.",word(.$V4, c(2), sep = fixed(";")))) 
  } else if (file_option == "ignore"){
    df <- read.table(path_inputfile, as.is = T, sep = "\t")
  } else if (file_option == "sqanti"){
    df <- read.table(path_inputfile, as.is = T, sep = "\t", header = TRUE) %>% 
      mutate(id = paste0(ifelse(structural_category != "fusion", 
                         word(associated_gene,c(1), sep ="_"),associated_gene),
                         "/",associated_transcript,"/",structural_category,"/", chrom, "/", length)) 
    # Create a new attribute called "novelGene"
    df$novelGene <- "Annotated Genes"
    df[grep("novelGene", df$associated_gene), "novelGene"] <- "Novel Genes"
    df$novelGene = factor(df$novelGene, levels = c("Novel Genes","Annotated Genes"),ordered=TRUE)
    
    df <- read_merge_gtf(sqanti_gtf,df)
  } else {
    print("Check arguments")
  }
  
  return(df)
}

# assume the files are the same in the two analysis, only difference is the directory name
all_read <- function(root_dir){
  all <- list(
  read_inputfile(paste0(root_dir,"TOFU/WholeIsoSeq.ignored_ids.txt"),"NA", "ignore"),                  # cupcake collapse ignored isoforms 
  read_inputfile(paste0(root_dir,"TOFU/WholeIsoSeq.collapsed.abundance.txt"),"NA", "cupcake"),   # cupcake collapse 
  read_inputfile(paste0(root_dir,"TOFU/WholeIsoSeq.collapsed.filtered.abundance.txt"),"NA","cupcake"),  # cupcake filter 
  read_inputfile(paste0(root_dir,"SQANTI/WholeIsoSeq.collapsed.filtered_classification.txt"),
                 paste0(root_dir,"SQANTI/WholeIsoSeq.collapsed.filtered_corrected.gtf"), "sqanti"), # sqanti characterisation 
  read_inputfile(paste0(root_dir,"SQANTI/WholeIsoSeq.collapsed.filtered_classification.filtered_lite_classification.txt"),
                 paste0(root_dir,"SQANTI/WholeIsoSeq.collapsed.filtered_classification.filtered_lite.gtf"), "sqanti"), # sqanti filter
  read_inputfile(paste0(root_dir,"SQANTI/WholeIsoSeq.collapsed.filtered_classification.filtered_lite_reasons.txt"),"NA","sqanti_discard"), # sqanti filtering reason
  read_inputfile(paste0(root_dir,"SQANTI_TAMA_FILTER/WholeIsoSeq.collapsed_sqantitamafiltered.classification.txt"),
                 paste0(root_dir,"SQANTI_TAMA_FILTER/WholeIsoSeq.collapsed_sqantitamafiltered.classification.gtf"), "sqanti"), # sqanti after tama filtering
  read_inputfile(paste0(root_dir,"SQANTI_TAMA_FILTER/tama_retained_pbid.txt"),"NA", "ignore"), # tama retained
  read_inputfile(paste0(root_dir,"TAMA_Filter/WholeIsoSeq_discarded.txt"),"NA", "tama_discard") # tama ignore
)
  names(all) <- c("cup_coll_ign","cup_coll","cup_fil","sq","sq_fil","sq_ign","sq_tama","sq_tama_id","sq_tama_discard")
  return(all)
}

# coverage threshold at 99%
<<<<<<< HEAD
<<<<<<< HEAD
testing_para <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/OLD_2020/All_Merged/Testing_Parameters"
c99_dir <- paste0(testing_para,"/DEMUX_CUSTOM/")
c99 <- all_read(c99_dir)

# coverage threshold at 95%
c95_dir <- paste0(testing_para,"/DEMUX_CUSTOM_ADJUST/")
c95 <- all_read(c95_dir)

# coverage threshold at 85%
c85_dir <-  paste0(testing_para,"/DEMUX_c85/")
c85 <- all_read(c85_dir)

Mapping

======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 c99_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/All_Merged/DEMUX_CUSTOM/" c99 <- all_read(c99_dir) # coverage threshold at 95% c95_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/All_Merged/DEMUX_CUSTOM_ADJUST/" c95 <- all_read(c95_dir)
<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Main Results

Cupcake Collapse - Discarded

As expected, all the transcripts discarded in 95% coverage dataset are within the 99% coverage dataset

Overlap

<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Filters

Cupcake Filter

Cupcake filter removes any 5’ degraded products. Interestingly, while cupcake collapse generates almost idental dataset between the two analyses, there is very low overlap between cupcake filter.

<<<<<<< HEAD <<<<<<< HEAD

=======

>>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 =======

>>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Sqanti

99% Coverage

Genes - Num of Unique Isoforms
======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Genes - Num of Unique Isoforms
<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5
Isoforms unique to coverage-99% dataset

Figure 1: Isoforms detected in SQANTI annotation using different Cupcake collapse parameters
UCSC genome browser track of isoforms annotated to a) Dcc gene and b) Ctnap5a gene from SQANTI annotation (pre-filtering). The same dataset was collapsed with Cupcake using either using coverage threshold of 99% (default) or 95% (determined experimentally using ERCCs).

Isoforms that were unique to the coverage-99% dataset are short-monoexonic isoforms at the 5’ and 3’ end (boxed blue).

95% Coverage

Genes - Num of Unique Isoforms
======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Genes - Num of Unique Isoforms
<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5
Isoforms unique to coverage-95% dataset

Figure 1: Isoforms detected in SQANTI annotation using different Cupcake collapse parameters
UCSC genome browser track of isoforms annotated to a) Ncam1 gene and b) Ide gene from SQANTI annotation (pre-filtering). The same dataset was collapsed with Cupcake using either using coverage threshold of 99% (default) or 95% (determined experimentally using ERCCs).

Isoforms that were unique to the coverage-99% dataset are multi-exonic transcripts that align well with the reference genome.

<<<<<<< HEAD <<<<<<< HEAD

85% Coverage

In 85%, but not in 99% or 95% coverage

Genes - Num of Unique Isoforms
======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

Post TAMA

99% Coverage

Genes - Num of Unique Isoforms

95% Coverage

Genes - Num of Unique Isoforms
======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5

99% Coverage

Genes - Num of Unique Isoforms

95% Coverage

Genes - Num of Unique Isoforms
<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5
Isoforms unique to coverage-95% dataset

Figure 1: Isoforms detected in after applying TAMA filtering of partial transcripts to SQANTI-filtered dataset
UCSC genome browser track of isoforms annotated to a) Ncam1 gene and b) Cadm1 gene after applying TAMA’s filtering (tama_remove_fragment_models.py). The same dataset was collapsed with Cupcake using either using coverage threshold of 99% (default) or 95% (determined experimentally using ERCCs).

TAMA filtering removed redundant 5’ partial transcripts in both datasets. However, there are evidently some isoforms (incomplete-splice match) that are in the coverage-95% dataset and not in the coverage-99% dataset.

<<<<<<< HEAD <<<<<<< HEAD

NCAM1

NCAM1 <- function(){
  c99$sq %>% filter(gtf_coordinates %in% setdiff(c99$sq$gtf_coordinates,c95$sq$gtf_coordinates)) %>% 
  filter(associated_gene == "Ncam1")

# transcripts discarded by 99%, but kept in the 95% during cupcake collapse
c95_kept <- setdiff(c99$cup_coll_ign$V1,c95$cup_coll_ign$V1)
c95$coll_group <- read.table(paste0(c95_dir,"TOFU/WholeIsoSeq.collapsed.group.txt"))

#write.table(c95_kept,paste0(c95_dir,"TOFU/c95additionaltranscripts.txt"), quote = F, row.names = F, col.names = F)

### extract the FL reads (clustered) which were discarded by 99%, but kept in the 95%, and which pbid they belonged to (linux)
# All_Merged=/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/All_Merged
#while read transcript; do grep -w $transcript $All_Merged/DEMUX_CUSTOM_ADJUST/TOFU/WholeIsoSeq.collapsed.group.txt; done < $All_Merged/DEMUX_CUSTOM_ADJUST/TOFU/c95additionaltranscripts.txt > $All_Merged/DEMUX_CUSTOM_ADJUST/TOFU/c95additionaltranscripts_pbid.txt 
c95$cup_coll_unique <- read.table(paste0(c95_dir,"TOFU/c95additionaltranscripts_pbid.txt")) %>% 
  left_join(., c95$sq[,c("isoform","associated_gene")], by = c("V1" = "isoform"))

c95$cup_fil <- read_gtf(paste0(c95_dir,"TOFU/WholeIsoSeq.collapsed.filtered.gff")) %>% full_join(.,c95$cup_fil, by = c("isoform" = "pbid"))
c99$cup_fil <- read_gtf(paste0(c99_dir,"TOFU/WholeIsoSeq.collapsed.filtered.gff")) %>% full_join(.,c99$cup_fil, by = c("isoform" = "pbid"))

# gtf coordinate for PB.16345.61 in c95
c95$cup_fil %>% filter(isoform == "PB.16345.61")
c99$cup_fil %>% filter(gtf_coordinates == "chr9 : 49517085 - 49569903")
# QC other coordinates
c95$cup_fil %>% filter(isoform == "PB.16345.26")
c99$cup_fil %>% filter(gtf_coordinates == "chr9 : 49505068 - 49557133")

# in 99% but not in 95% 
c99$cup_fil[c99$cup_fil$gtf_coordinates %in% 
              setdiff(c99$cup_fil %>% filter(cluster == "PB.16234") %>% .[,c("gtf_coordinates")],
                    c95$cup_fil %>% filter(cluster == "PB.16345")  %>% .[,c("gtf_coordinates")]),c("isoform","gtf_coordinates")] %>%
  left_join(c99$sq[,c("isoform","associated_gene","exons","structural_category","subcategory")], by = "isoform")

# in 95% but not in 99% 
View(c95$cup_fil[c95$cup_fil$gtf_coordinates %in% 
              setdiff(c95$cup_fil %>% filter(cluster == "PB.16345") %>% .[,c("gtf_coordinates")],
                    c99$cup_fil %>% filter(cluster == "PB.16234")  %>% .[,c("gtf_coordinates")]),c("isoform","gtf_coordinates")] %>% 
  left_join(c95$sq[,c("isoform","associated_gene","exons","structural_category","subcategory")], by = "isoform"))


c95$cup_fil %>% filter(cluster == "PB.16234")  
c99$cup_fil %>% filter(gtf_coordinates == "chr9 : 49505068 - 49557133")  
c95$cup_fil %>% filter(gtf_coordinates == "chr9 : 49505069 - 49567424")  

# 1.
c95$cup_coll_group <- read.table(paste0(c95_dir,"TOFU/WholeIsoSeq.collapsed.group.txt"))
c95$cup_coll_group%>% filter(V1 == "PB.16345.3")
c99$cup_coll_ign %>% filter(V1 %in% c("transcript/6640","transcript/7702"))

# 2.
c95$cup_coll_group%>% filter(V1 == "PB.16345.61")
c99$cup_coll_ign %>% filter(V1 %in% c("transcript/113982"))

# 4.
c95$cup_coll_group%>% filter(V1 == "PB.16345.59")
c99$cup_coll_ign %>% filter(V1 %in% c("transcript/114114"))

# counts still retained
#Ogfrl1, 83.6 chr1 : 23366439 - 23367069
# c95$cup_coll_unique[13,]
c95$cup_coll_unique[13,c("V2")]
c99$cup_coll_ign %>% filter(V1 %in% "transcript/256839")  # retained in c95 but discarded in c99
c95$cup_coll_group%>% filter(V1 == "PB.83.6")
c95$cup_fil %>% filter(isoform == "PB.83.6")

#Ogfrl1, 80.7 chr1 : 23366439 - 23367069
c99$cup_coll_group <- read.table(paste0(c99_dir,"TOFU/WholeIsoSeq.collapsed.group.txt"))
c99$cup_coll_group[grep("transcript/264944",c99$cup_coll_group$V2),] 
c99$cup_coll_group %>% filter(V1 == "PB.80.7")
c99$cup_fil %>% filter(isoform == "PB.80.7")

c95$sq %>% filter(isoform == "PB.83.6") %>% mutate(Total_Fl = sum(.[,48:59])) %>% .[,c("isoform","associated_gene","Total_Fl")]
c99$sq %>% filter(isoform == "PB.80.7") %>% mutate(Total_Fl = sum(.[,48:59])) %>% .[,c("isoform","associated_gene","Total_Fl")]

## 
c99$cup_readstat <- read.table(paste0(c99_dir,"TOFU/WholeIsoSeq.collapsed.read_stat.txt"), header = T)
c95$cup_readstat <- read.table(paste0(c95_dir,"TOFU/WholeIsoSeq.collapsed.read_stat.txt"), header = T)

c99$cup_readstat %>% filter(pbid == "PB.80.7")
c95$cup_readstat %>% filter(pbid == "PB.83.6")
}
======= ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5
<<<<<<< HEAD >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5 ======= >>>>>>> c34b526e4f39c14e6ca6579cf2dd5726cda490a5